Import required packages¶
import pickle
from pathlib import Path
import dtreeviz
import ee
import geemap
import geojson
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import scipy.interpolate
import shap
from cuml.explainer import PermutationExplainer
from cuml.model_selection import train_test_split as cuml_train_test_split
from optuna.integration.lightgbm import LightGBMTunerCV
from optuna_integration import LightGBMPruningCallback
from plotly import graph_objects as go
from sklearn.metrics import (
accuracy_score,
precision_recall_fscore_support,
roc_auc_score,
)
from sklearn.model_selection import train_test_split
Second enviroment import due to environment conflict between RAPIDSAI and Pytorch-tabnet
# import lightgbm as lgb
# import numpy as np
# import pandas as pd
# import torch
# from pytorch_tabnet.tab_model import TabNetClassifier
# from sklearn.linear_model import LogisticRegression
# from sklearn.metrics import (
# accuracy_score,
# precision_recall_fscore_support,
# roc_auc_score,
# )
# from sklearn.model_selection import train_test_split
ee.Authenticate()
ee.Initialize()
Map = geemap.Map()
Map.add_basemap("HYBRID")
Exploratory Data Analysis (EDA)¶
Data Collection¶
flood_collection: ee.ImageCollection = ee.ImageCollection(
"GLOBAL_FLOOD_DB/MODIS_EVENTS/V1"
)
landsat_collection: ee.ImageCollection = ee.ImageCollection(
"LANDSAT/LE07/C02/T1_L2"
).filterDate("2000-02-17", "2018-12-10")
elevation_image = ee.Image("CGIAR/SRTM90_V4")
print("Image structure of each collection:")
display(flood_collection.first(), landsat_collection.first(), elevation_image)
Image structure of each collection:
- type:Image
- id:GLOBAL_FLOOD_DB/MODIS_EVENTS/V1/DFO_1586_From_20000218_to_20000301
- version:1685079690200045
- id:flooded
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:255
- min:0
- precision:int
- 0:5686
- 1:8984
- id:duration
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:5686
- 1:8984
- id:clear_views
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:5686
- 1:8984
- id:clear_perc
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- precision:float
- 0:5686
- 1:8984
- id:jrc_perm_water
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:255
- min:0
- precision:int
- 0:5686
- 1:8984
- id:1586
- cc:AUS
- composite_type:3Day
- countries:Australia
- dfo_centroid_x:143.6978
- dfo_centroid_y:-31.268059
- dfo_country:Australia
- dfo_dead:1
- dfo_displaced:200
- dfo_main_cause:Monsoonal rain
- dfo_other_country:NA
- dfo_severity:2
- dfo_validation_type:News
- gfd_country_code:['AS']
- gfd_country_name:['AUSTRALIA']
- glide_index:NA
- otsu_sample_res:231.66
- slope_threshold:5
- system:asset_size:8988914
- type:LinearRing
- 0:148.77567883770345
- 1:-37.692190384571404
- 0:149.9754567941598
- 1:-37.692185550952246
- 0:149.9749728427817
- 1:-17.513770729195038
- 0:149.17465716800353
- 1:-17.51377546120134
- 0:147.97772223305088
- 1:-17.512579461230274
- 0:146.3818089465066
- 1:-17.512579422994158
- 0:144.7858956959944
- 1:-17.512579420723743
- 0:143.1899824714047
- 1:-17.512579400063284
- 0:141.59406922326394
- 1:-17.512579402044587
- 0:139.99815600423668
- 1:-17.5125794432258
- 0:138.40224272344366
- 1:-17.512579385808014
- 0:137.20294874528088
- 1:-17.51377068144409
- 0:137.20246484353038
- 1:-37.69218554664975
- 0:139.59917774865693
- 1:-37.692190354915816
- 0:141.1950909277643
- 1:-37.69219034319341
- 0:143.58896079571315
- 1:-37.69219036366114
- 0:145.18487408134828
- 1:-37.69219039048359
- 0:147.17976561083674
- 1:-37.692190365837604
- 0:148.77567883770345
- 1:-37.692190384571404
- system:index:DFO_1586_From_20000218_to_20000301
- system:time_end:951868800000
- system:time_start:950832000000
- threshold_b1b2:0.711
- threshold_b7:1815.18
- threshold_type:otsu
- type:Image
- id:LANDSAT/LE07/C02/T1_L2/LE07_001004_20000610
- version:1612545591275537
- id:SR_B1
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:SR_B2
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:SR_B3
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:SR_B4
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:SR_B5
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:SR_B7
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:SR_ATMOS_OPACITY
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:SR_CLOUD_QA
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:255
- min:0
- precision:int
- 0:9101
- 1:9011
- id:ST_B6
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:ST_ATRAN
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_CDIST
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_DRAD
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_EMIS
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_EMSD
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_QA
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_TRAD
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:ST_URAD
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:9101
- 1:9011
- id:QA_PIXEL
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- id:QA_RADSAT
- crs:EPSG:32628
- 0:30
- 1:0
- 2:337185
- 3:0
- 4:-30
- 5:8808915
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:9101
- 1:9011
- ALGORITHM_SOURCE_SURFACE_REFLECTANCE:LEDAPS_3.4.0
- ALGORITHM_SOURCE_SURFACE_TEMPERATURE:st_1.3.0
- CLOUD_COVER:0
- CLOUD_COVER_LAND:0
- COLLECTION_CATEGORY:T1
- COLLECTION_NUMBER:2
- CORRECTION_BIAS_BAND_1:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_2:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_3:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_4:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_5:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_6_VCID_1:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_6_VCID_2:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_7:INTERNAL_CALIBRATION
- CORRECTION_BIAS_BAND_8:INTERNAL_CALIBRATION
- CORRECTION_GAIN_BAND_1:CPF
- CORRECTION_GAIN_BAND_2:CPF
- CORRECTION_GAIN_BAND_3:CPF
- CORRECTION_GAIN_BAND_4:CPF
- CORRECTION_GAIN_BAND_5:CPF
- CORRECTION_GAIN_BAND_6_VCID_1:CPF
- CORRECTION_GAIN_BAND_6_VCID_2:CPF
- CORRECTION_GAIN_BAND_7:CPF
- CORRECTION_GAIN_BAND_8:CPF
- DATA_SOURCE_AIR_TEMPERATURE:NCEP
- DATA_SOURCE_ELEVATION:GLS2000
- DATA_SOURCE_OZONE:TOMS
- DATA_SOURCE_PRESSURE:NCEP
- DATA_SOURCE_REANALYSIS:GEOS-5 FP-IT
- DATA_SOURCE_WATER_VAPOR:NCEP
- DATE_ACQUIRED:2000-06-10
- DATE_PRODUCT_GENERATED:1600413331000
- DATUM:WGS84
- EARTH_SUN_DISTANCE:1.0153425
- ELLIPSOID:WGS84
- EPHEMERIS_TYPE:DEFINITIVE
- GAIN_BAND_1:H
- GAIN_BAND_2:H
- GAIN_BAND_3:H
- GAIN_BAND_4:H
- GAIN_BAND_5:H
- GAIN_BAND_6_VCID_1:L
- GAIN_BAND_6_VCID_2:H
- GAIN_BAND_7:H
- GAIN_BAND_8:L
- GAIN_CHANGE_BAND_1:HH
- GAIN_CHANGE_BAND_2:HH
- GAIN_CHANGE_BAND_3:HH
- GAIN_CHANGE_BAND_4:HH
- GAIN_CHANGE_BAND_5:HH
- GAIN_CHANGE_BAND_6_VCID_1:LL
- GAIN_CHANGE_BAND_6_VCID_2:HH
- GAIN_CHANGE_BAND_7:HH
- GAIN_CHANGE_BAND_8:LL
- GAIN_CHANGE_SCAN_BAND_1:0
- GAIN_CHANGE_SCAN_BAND_2:0
- GAIN_CHANGE_SCAN_BAND_3:0
- GAIN_CHANGE_SCAN_BAND_4:0
- GAIN_CHANGE_SCAN_BAND_5:0
- GAIN_CHANGE_SCAN_BAND_6_VCID_1:0
- GAIN_CHANGE_SCAN_BAND_6_VCID_2:0
- GAIN_CHANGE_SCAN_BAND_7:0
- GAIN_CHANGE_SCAN_BAND_8:0
- GEOMETRIC_RMSE_MODEL:9.307
- GEOMETRIC_RMSE_MODEL_X:5.995
- GEOMETRIC_RMSE_MODEL_Y:7.119
- GRID_CELL_SIZE_REFLECTIVE:30
- GRID_CELL_SIZE_THERMAL:30
- GROUND_CONTROL_POINTS_MODEL:23
- GROUND_CONTROL_POINTS_VERSION:5
- IMAGE_QUALITY:9
- L1_DATE_PRODUCT_GENERATED:2020-09-18T06:45:47Z
- L1_LANDSAT_PRODUCT_ID:LE07_L1TP_001004_20000610_20200918_02_T1
- L1_PROCESSING_LEVEL:L1TP
- L1_PROCESSING_SOFTWARE_VERSION:LPGS_15.3.1c
- L1_REQUEST_ID:L2
- LANDSAT_PRODUCT_ID:LE07_L2SP_001004_20000610_20200918_02_T1
- LANDSAT_SCENE_ID:LE70010042000162SGS00
- MAP_PROJECTION:UTM
- ORIENTATION:NORTH_UP
- PROCESSING_LEVEL:L2SP
- PROCESSING_SOFTWARE_VERSION:LPGS_15.3.1c
- REFLECTANCE_ADD_BAND_1:-0.2
- REFLECTANCE_ADD_BAND_2:-0.2
- REFLECTANCE_ADD_BAND_3:-0.2
- REFLECTANCE_ADD_BAND_4:-0.2
- REFLECTANCE_ADD_BAND_5:-0.2
- REFLECTANCE_ADD_BAND_7:-0.2
- REFLECTANCE_MULT_BAND_1:2.75e-05
- REFLECTANCE_MULT_BAND_2:2.75e-05
- REFLECTANCE_MULT_BAND_3:2.75e-05
- REFLECTANCE_MULT_BAND_4:2.75e-05
- REFLECTANCE_MULT_BAND_5:2.75e-05
- REFLECTANCE_MULT_BAND_7:2.75e-05
- REFLECTIVE_LINES:9011
- REFLECTIVE_SAMPLES:9101
- REQUEST_ID:L2
- SATURATION_BAND_1:Y
- SATURATION_BAND_2:Y
- SATURATION_BAND_3:Y
- SATURATION_BAND_4:Y
- SATURATION_BAND_5:N
- SATURATION_BAND_6_VCID_1:N
- SATURATION_BAND_6_VCID_2:N
- SATURATION_BAND_7:N
- SATURATION_BAND_8:N
- SCENE_CENTER_TIME:14:00:14.7523815Z
- SENSOR_ANOMALIES:NONE
- SENSOR_ID:ETM
- SENSOR_MODE:SAM
- SENSOR_MODE_SLC:ON
- SPACECRAFT_ID:LANDSAT_7
- STATION_ID:SGS
- SUN_AZIMUTH:-164.3367295
- SUN_ELEVATION:34.53465096
- TEMPERATURE_ADD_BAND_ST_B6:149
- TEMPERATURE_MAXIMUM_BAND_ST_B6:372.999941
- TEMPERATURE_MINIMUM_BAND_ST_B6:149.003418
- TEMPERATURE_MULT_BAND_ST_B6:0.00341802
- THERMAL_LINES:9011
- THERMAL_SAMPLES:9101
- UTM_ZONE:28
- WRS_PATH:1
- WRS_ROW:4
- WRS_TYPE:2
- system:asset_size:263744119
- type:LinearRing
- 0:-10.798987380789246
- 1:78.01858268319442
- 0:-10.411971358532213
- 1:78.087262939023
- 0:-10.405192241584777
- 1:78.09033737226109
- 0:-10.45002686192686
- 1:78.10115918140148
- 0:-13.32072524601852
- 1:78.7326873858347
- 0:-15.245031539703183
- 1:79.10582796599041
- 0:-16.33166698497307
- 1:79.30080490370658
- 0:-16.352886368757954
- 1:79.29918418754635
- 0:-16.38707288609935
- 1:79.29308175095046
- 0:-21.80626662764263
- 1:78.07770870627853
- 0:-21.83782445545615
- 1:78.0686748141708
- 0:-21.846477474236046
- 1:78.0643237332263
- 0:-21.724798617349382
- 1:78.0443847852502
- 0:-18.016971684322044
- 1:77.38452706148878
- 0:-15.977762221404255
- 1:76.96665695958274
- 0:-15.96255873558572
- 1:76.96844307081405
- 0:-15.920977871524345
- 1:76.97729523454795
- 0:-14.553805105872115
- 1:77.28118851198714
- 0:-13.06187389601564
- 1:77.59055217371134
- 0:-11.819086610724035
- 1:77.83121753786516
- 0:-11.497264139900494
- 1:77.89133707865153
- 0:-10.798987380789246
- 1:78.01858268319442
- system:index:LE07_001004_20000610
- system:time_end:960645614752
- system:time_start:960645614752
- type:Image
- id:CGIAR/SRTM90_V4
- version:1641990053291277
- id:elevation
- crs:EPSG:4326
- 0:0.000833333333333
- 1:0
- 2:-180
- 3:0
- 4:-0.000833333333333
- 5:60
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:432000
- 1:144000
- 0:950227200000
- 1:951177600000
- description:
The Shuttle Radar Topography Mission (SRTM) digital elevation dataset was originally produced to provide consistent, high-quality elevation data at near global scope. This version of the SRTM digital elevation data has been processed to fill data voids, and to facilitate its ease of use.
Provider: NASA/CGIAR
Bands
Name Description elevation Elevation
Terms of Use
DISTRIBUTION. Users are prohibited from any commercial, non-free resale, or redistribution without explicit written permission from CIAT. Users should acknowledge CIAT as the source used in the creation of any reports, publications, new datasets, derived products, or services resulting from the use of this dataset. CIAT also request reprints of any publications and notification of any redistributing efforts. For commercial access to the data, send requests to Andy Jarvis.
NO WARRANTY OR LIABILITY. CIAT provides these data without any warranty of any kind whatsoever, either express or implied, including warranties of merchantability and fitness for a particular purpose. CIAT shall not be liable for incidental, consequential, or special damages arising out of the use of any data.
ACKNOWLEDGMENT AND CITATION. Any users are kindly asked to cite this data in any published material produced using this data, and if possible link web pages to the CIAT-CSI SRTM website.
Suggested citation(s)
Jarvis, A., H.I. Reuter, A. Nelson, E. Guevara. 2008. Hole-filled SRTM for the globe Version 4, available from the CGIAR-CSI SRTM 90m Database: https://srtm.csi.cgiar.org.
- 0:cgiar
- 1:dem
- 2:elevation
- 3:geophysical
- 4:srtm
- 5:topography
- period:0
- 0:srtm
- 1:elevation
- 2:topography
- 3:dem
- 4:geophysical
- provider:NASA/CGIAR
- provider_url:https://srtm.csi.cgiar.org/
- sample:https://mw1.google.com/ges/dd/images/SRTM90_V4_sample.png
- 0:cgiar
- system:asset_size:18827626666
- system:time_end:951177600000
- system:time_start:950227200000
- system:visualization_0_bands:elevation
- system:visualization_0_gamma:1.6
- system:visualization_0_max:8000.0
- system:visualization_0_min:-100.0
- system:visualization_0_name:Elevation
- 0:cgiar
- 1:dem
- 2:elevation
- 3:geophysical
- 4:srtm
- 5:topography
- thumb:https://mw1.google.com/ges/dd/images/SRTM90_V4_thumb.png
- title:SRTM Digital Elevation Data Version 4
- type_name:Image
- visualization_0_bands:elevation
- visualization_0_max:8000.0
- visualization_0_min:-100.0
- visualization_0_name:Elevation
Mask the images with conditions:
Clear percentage of flood images >= 0.5
Not permanent water
def mask_flood(image):
clear_perc_mask = image.select("clear_perc").gte(0.5) # Clear percentage >= 50%
jrc_perm_water_mask = image.select("jrc_perm_water").eq(
0
) # 0 - non-water | 1 - permanent water
combined_mask = clear_perc_mask.And(jrc_perm_water_mask)
masked_image = image.updateMask(combined_mask)
return masked_image
flood_collection = flood_collection.map(mask_flood)
Compose the following bands into an image collection:
x, y, elevation as z and slope
SR_B1, SR_B2, SR_B3, SR_B4, SR_B5, SR_B7 from Landsat Collection
NDWI calculated using SR_B2 and SR_B4
duration, clear_perc, flooded from Global Flood Database
landsat_interest_bands = ["SR_B1", "SR_B2", "SR_B3", "SR_B4", "SR_B5", "SR_B7"]
flood_interest_bands = ["duration", "clear_perc", "flooded"]
elevation = elevation_image.select("elevation").rename("z")
slope = ee.Terrain.slope(elevation_image)
def filter_single_scene(image):
date_range = ee.DateRange(
image.get("system:time_start"), image.get("system:time_end")
)
return ee.Algorithms.If(
date_range.end().difference(date_range.start(), "days").gt(0), image, None
)
def compose_landsat(image):
geometry = image.geometry()
date_range = ee.DateRange(
image.get("system:time_start"), image.get("system:time_end")
)
landsat_image = (
landsat_collection.select(ee.List(landsat_interest_bands))
.filterDate(date_range.start(), date_range.end())
.filterBounds(geometry)
.median()
)
return ee.Algorithms.If(
landsat_image.bandNames().length(),
image.select(flood_interest_bands).addBands(
landsat_image,
ee.List(landsat_interest_bands),
),
None,
)
def add_elevation_bands(image: ee.Image):
return image.addBands(elevation).addBands(slope)
def set_default_projection(image: ee.Image):
projection = image.select("flooded").projection()
return image.setDefaultProjection(projection)
images = (
flood_collection.map(filter_single_scene, True)
.map(compose_landsat, True)
.map(add_elevation_bands)
.map(set_default_projection)
)
print(f"Number of flood event images left: {images.size().getInfo()}")
images.first()
Number of flood event images left: 848
- type:Image
- id:GLOBAL_FLOOD_DB/MODIS_EVENTS/V1/DFO_1586_From_20000218_to_20000301
- version:1685079690200045
- id:duration
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:5686
- 1:8984
- id:clear_perc
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- precision:float
- 0:5686
- 1:8984
- id:flooded
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:255
- min:0
- precision:int
- 0:5686
- 1:8984
- id:SR_B1
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B2
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B3
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B4
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B5
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B7
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:65535
- min:0
- precision:double
- id:z
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:160302
- 1:53434
- 0:-141245
- 1:-34516
- id:slope
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:137.20418491999513
- 3:0
- 4:-0.002245788210298804
- 5:-17.514902252120372
- type:PixelType
- max:90
- min:0
- precision:float
- id:1586
- cc:AUS
- composite_type:3Day
- countries:Australia
- dfo_centroid_x:143.6978
- dfo_centroid_y:-31.268059
- dfo_country:Australia
- dfo_dead:1
- dfo_displaced:200
- dfo_main_cause:Monsoonal rain
- dfo_other_country:NA
- dfo_severity:2
- dfo_validation_type:News
- gfd_country_code:['AS']
- gfd_country_name:['AUSTRALIA']
- glide_index:NA
- otsu_sample_res:231.66
- slope_threshold:5
- system:asset_size:8988914
- type:LinearRing
- 0:148.77567883770345
- 1:-37.692190384571404
- 0:149.9754567941598
- 1:-37.692185550952246
- 0:149.9749728427817
- 1:-17.513770729195038
- 0:149.17465716800353
- 1:-17.51377546120134
- 0:147.97772223305088
- 1:-17.512579461230274
- 0:146.3818089465066
- 1:-17.512579422994158
- 0:144.7858956959944
- 1:-17.512579420723743
- 0:143.1899824714047
- 1:-17.512579400063284
- 0:141.59406922326394
- 1:-17.512579402044587
- 0:139.99815600423668
- 1:-17.5125794432258
- 0:138.40224272344366
- 1:-17.512579385808014
- 0:137.20294874528088
- 1:-17.51377068144409
- 0:137.20246484353038
- 1:-37.69218554664975
- 0:139.59917774865693
- 1:-37.692190354915816
- 0:141.1950909277643
- 1:-37.69219034319341
- 0:143.58896079571315
- 1:-37.69219036366114
- 0:145.18487408134828
- 1:-37.69219039048359
- 0:147.17976561083674
- 1:-37.692190365837604
- 0:148.77567883770345
- 1:-37.692190384571404
- system:index:DFO_1586_From_20000218_to_20000301
- system:time_end:951868800000
- system:time_start:950832000000
- threshold_b1b2:0.711
- threshold_b7:1815.18
- threshold_type:otsu
def to_features(image: ee.Image):
# Reduce resolution of bands to a common scale
image = image.addBands(
image.select([*landsat_interest_bands, "z", "slope"]).reduceResolution(
reducer=ee.Reducer.mean()
),
overwrite=True,
)
return image.stratifiedSample(
classBand="flooded",
numPoints=50,
dropNulls=True,
scale=250,
tileScale=4,
region=image.geometry(),
geometries=False,
projection=image.projection(),
)
dataset = images.map(to_features).flatten()
# geemap.ee_export_vector_to_drive(dataset, description="flood_dataset", fileFormat="csv")
Extract the details of flood events
interest_properties = {
"system:index": "id",
"dfo_country": "primary_country",
"dfo_severity": "severity",
"system:time_start": "start_date",
"system:time_end": "end_date",
"dfo_centroid_x": "center_lon",
"dfo_centroid_y": "center_lat",
"dfo_main_cause": "main_cause",
"dfo_dead": "dead",
"dfo_displaced": "displaced",
}
def extract_properties(image):
old_keys = ee.List(list(interest_properties.keys()))
new_keys = ee.List(list(interest_properties.values()))
props: ee.Dictionary = image.toDictionary(old_keys).rename(old_keys, new_keys, True)
mask = image.select("flooded").gt(0)
area = (
ee.Image.pixelArea()
.mask(mask)
.reduceRegion(
reducer=ee.Reducer.sum(),
scale=250,
geometry=image.geometry(),
maxPixels=412598311,
crs=image.projection(),
)
).get("area")
props = props.set("area", area)
return ee.Feature(None, props)
flood_properties = ee.FeatureCollection(flood_collection.map(extract_properties))
# geemap.ee_to_csv(flood_properties, filename="../data/asg3/flood_props.csv")
Data Cleaning¶
Checking for data distribution and null entry
df_flood_props = pd.read_csv("../data/asg3/flood_props.csv", index_col="id").reset_index()
display(df_flood_props.head(), df_flood_props.describe(), df_flood_props.isna().sum())
| id | area | center_lat | center_lon | dead | displaced | end_date | main_cause | primary_country | severity | start_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DFO_1586_From_20000218_to_20000301 | 3.167333e+08 | -31.268059 | 143.697800 | 1 | 200 | 951868800000 | Monsoonal rain | Australia | 2.0 | 950832000000 |
| 1 | DFO_1587_From_20000217_to_20000311 | 2.502000e+08 | -15.782624 | 47.295670 | 200 | 800000 | 952732800000 | Tropical cyclone | Madagascar | 1.0 | 950745600000 |
| 2 | DFO_1595_From_20000405_to_20000425 | 8.910377e+07 | 46.763746 | 22.415404 | 10 | 623 | 956620800000 | Snowmelt | Romania | 2.0 | 954892800000 |
| 3 | DFO_1614_From_20000711_to_20000810 | 4.397571e+09 | 11.242567 | 105.063841 | 33 | 25000 | 965865600000 | Monsoonal rain | Thailand | 1.0 | 963273600000 |
| 4 | DFO_1627_From_20000830_to_20000910 | 9.194628e+08 | 43.773883 | 132.057679 | 31 | 14140 | 968544000000 | Tropical cyclone | China | 1.0 | 967593600000 |
| area | center_lat | center_lon | dead | displaced | end_date | severity | start_date | |
|---|---|---|---|---|---|---|---|---|
| count | 9.130000e+02 | 913.000000 | 913.000000 | 913.000000 | 9.130000e+02 | 9.130000e+02 | 913.000000 | 9.130000e+02 |
| mean | 6.953646e+09 | 15.395235 | 38.340958 | 180.046002 | 2.674868e+05 | 1.224903e+12 | 1.297371 | 1.223187e+12 |
| std | 1.188170e+10 | 24.050427 | 78.687131 | 3332.691202 | 1.995806e+06 | 1.472525e+11 | 0.392520 | 1.474857e+11 |
| min | 0.000000e+00 | -45.953281 | -156.215507 | 0.000000 | 0.000000e+00 | 9.518688e+11 | 1.000000 | 9.507456e+11 |
| 25% | 7.060815e+08 | 0.058024 | -13.967824 | 0.000000 | 3.000000e+01 | 1.103328e+12 | 1.000000 | 1.102291e+12 |
| 50% | 2.223036e+09 | 18.877688 | 55.552555 | 5.000000 | 3.500000e+03 | 1.197245e+12 | 1.000000 | 1.195085e+12 |
| 75% | 7.381296e+09 | 32.886368 | 103.415798 | 27.000000 | 4.000000e+04 | 1.330301e+12 | 1.500000 | 1.329696e+12 |
| max | 9.854770e+10 | 68.000870 | 178.075692 | 100000.000000 | 4.000000e+07 | 1.544400e+12 | 2.000000 | 1.543968e+12 |
id 0 area 0 center_lat 0 center_lon 0 dead 0 displaced 0 end_date 0 main_cause 0 primary_country 0 severity 0 start_date 0 dtype: int64
df_flood_data = pd.read_csv("../data/asg3/flood_dataset.csv", index_col="system:index").reset_index()
display(df_flood_data.head(), df_flood_data.describe(), df_flood_data.isna().sum())
| system:index | SR_B1 | SR_B2 | SR_B3 | SR_B4 | SR_B5 | SR_B7 | clear_perc | duration | flooded | slope | z | .geo | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | DFO_1586_From_20000218_to_20000301_0 | 38182.0 | 29077.0 | 34219.0 | 22596.0 | 21604.0 | 18202.0 | 1.0 | 0 | 0 | 1.317596 | 119.0 | {"type":"MultiPoint","coordinates":[]} |
| 1 | DFO_1586_From_20000218_to_20000301_1 | 10304.0 | 12775.0 | 16730.0 | 20689.0 | 24540.0 | 21194.0 | 1.0 | 0 | 0 | 0.154581 | 151.0 | {"type":"MultiPoint","coordinates":[]} |
| 2 | DFO_1586_From_20000218_to_20000301_2 | 14348.0 | 17113.0 | 19445.0 | 20881.0 | 24473.0 | 22067.0 | 1.0 | 0 | 0 | 0.000000 | 29.0 | {"type":"MultiPoint","coordinates":[]} |
| 3 | DFO_1586_From_20000218_to_20000301_3 | 65535.0 | 65535.0 | 65535.0 | 31902.0 | 25274.0 | 20551.0 | 1.0 | 0 | 0 | 1.292620 | 47.0 | {"type":"MultiPoint","coordinates":[]} |
| 4 | DFO_1586_From_20000218_to_20000301_4 | 12960.0 | 13637.0 | 14640.0 | 16029.0 | 16719.0 | 14145.0 | 1.0 | 0 | 0 | 0.240548 | 65.0 | {"type":"MultiPoint","coordinates":[]} |
| SR_B1 | SR_B2 | SR_B3 | SR_B4 | SR_B5 | SR_B7 | clear_perc | duration | flooded | slope | z | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 | 80354.000000 |
| mean | 20092.456841 | 19307.969585 | 20190.967046 | 19047.787428 | 15720.140640 | 13064.029526 | 0.983093 | 2.359957 | 0.487891 | 1.249266 | 385.527740 |
| std | 18827.824469 | 16747.044276 | 17624.727589 | 9175.595795 | 7648.333485 | 4669.589228 | 0.028030 | 5.200521 | 0.499856 | 2.410576 | 761.573009 |
| min | 1347.000000 | 3866.500000 | 5128.500000 | 5953.000000 | 6992.500000 | 7201.000000 | 0.625000 | 0.000000 | 0.000000 | 0.000000 | -123.000000 |
| 25% | 9200.000000 | 9996.000000 | 9938.000000 | 12922.750000 | 10818.000000 | 9401.625000 | 0.972222 | 0.000000 | 0.000000 | 0.221696 | 26.000000 |
| 50% | 10993.000000 | 12044.000000 | 12450.000000 | 17228.750000 | 14452.000000 | 11975.000000 | 1.000000 | 0.000000 | 0.000000 | 0.490689 | 135.000000 |
| 75% | 18317.000000 | 18206.375000 | 19191.750000 | 22198.000000 | 18196.875000 | 15736.000000 | 1.000000 | 3.000000 | 1.000000 | 1.305663 | 372.000000 |
| max | 65535.000000 | 65535.000000 | 65535.000000 | 65535.000000 | 65535.000000 | 65535.000000 | 1.000000 | 145.000000 | 1.000000 | 50.952850 | 6616.000000 |
system:index 0 SR_B1 0 SR_B2 0 SR_B3 0 SR_B4 0 SR_B5 0 SR_B7 0 clear_perc 0 duration 0 flooded 0 slope 0 z 0 .geo 0 dtype: int64
with open("../data/asg3/countries.geojson", encoding="utf-8") as f:
countries_boundary = geojson.load(f)
countries_ADM = [
feature["properties"]["ADMIN"] for feature in countries_boundary["features"]
]
Verify all country is valid with ADM standard
def check_country_exists_ADM(countries):
return pd.DataFrame(
{
"primary_country": countries,
"exists_in_ADM": countries.isin(countries_ADM),
}
)
df_temp = check_country_exists_ADM(
pd.Series(df_flood_props["primary_country"].unique())
)
df_temp[df_temp["exists_in_ADM"] == False]
| primary_country | exists_in_ADM | |
|---|---|---|
| 14 | USA | False |
| 18 | Africa | False |
| 23 | Texas | False |
| 24 | Columbia | False |
| 33 | Burma | False |
| 41 | UK | False |
| 66 | Bosnia-Herzegovina | False |
| 68 | Venezulea | False |
| 83 | Serbia-Montenegro | False |
| 92 | Democratic Republic of Congo | False |
| 93 | Tanzania | False |
| 94 | Guatamala | False |
| 97 | Serbia | False |
| 103 | Inda | False |
| 106 | Tasmania | False |
| 109 | Gaza | False |
| 110 | Scotland | False |
| 111 | The Gambia | False |
| 116 | Kazahkstan | False |
| 117 | Uruguay, | False |
| 119 | Bahamas | False |
Transform invalid countries' value according to ADM standard
country_to_adm = {
"USA": "United States of America",
"UK": "United Kingdom",
"Burma": "Myanmar",
"Tanzania": "United Republic of Tanzania",
"Columbia": "Colombia",
"Bosnia-Herzegovina": "Bosnia and Herzegovina",
"Guatamala": "Guatemala",
"Serbia": "Republic of Serbia",
"Africa": "Central African Republic", # Africa is a continent, not a country
"Texas": "United States of America", # Texas is a state in the USA
"Venezulea": "Venezuela",
"Serbia-Montenegro": "Republic of Serbia", # No longer exists as a single country
"Inda": "India",
"Democratic Republic of Congo": "Democratic Republic of the Congo",
"Tasmania": "Australia", # Tasmania is a state in Australia
"The Gambia": "Gambia",
"Scotland": "United Kingdom", # Scotland is part of the United Kingdom
"Gaza": "Palestine", # Gaza is a region in Palestine
"Kazahkstan": "Kazakhstan",
"Uruguay,": "Uruguay",
"Bahamas": "The Bahamas",
}
def process_country_name(country):
processed_country = country_to_adm.get(country)
return processed_country if processed_country else country
df_flood_props_cleaned = df_flood_props.copy()
df_flood_props_cleaned["primary_country"] = df_flood_props["primary_country"].apply(
process_country_name
)
df_temp = check_country_exists_ADM(
pd.Series(df_flood_props_cleaned["primary_country"].unique())
)
df_temp.sort_values("exists_in_ADM")
| primary_country | exists_in_ADM | |
|---|---|---|
| 0 | Australia | True |
| 80 | Republic of Serbia | True |
| 79 | Slovakia | True |
| 78 | Ecuador | True |
| 77 | Jamaica | True |
| ... | ... | ... |
| 31 | Iran | True |
| 30 | Germany | True |
| 29 | South Africa | True |
| 40 | United Kingdom | True |
| 110 | Mongolia | True |
111 rows × 2 columns
# df_flood_props_cleaned.to_csv("../data/asg3/flood_props_cleaned.csv")
Data Visualization¶
causes = [x.lower() for x in df_flood_props_cleaned["main_cause"].to_list()]
categories = {
"Heavy rain": ["rain", "monsoon", "torrential"],
"Dam break/release": ["dam", "levy", "release"],
"Snowmelt": ["snow"],
"Ice jam": ["ice"],
"Tropical storm": ["tropical", "typhoon", "hurricane"],
}
df_flood_cause = pd.DataFrame(
data={
"event": list(categories.keys()) + ["Other"],
"count": np.zeros(len(categories) + 1, dtype=int),
"matched": [[] for _ in range(len(categories) + 1)],
}
)
for event in causes:
matched = False
for category, keywords in categories.items():
if any(keyword in event for keyword in keywords):
idx = df_flood_cause.index[df_flood_cause["event"] == category].tolist()[0]
df_flood_cause.at[idx, "count"] += 1
df_flood_cause.at[idx, "matched"].append(event)
matched = True
if not matched:
idx = df_flood_cause.index[df_flood_cause["event"] == "Other"].tolist()[0]
df_flood_cause.at[idx, "count"] += 1
df_flood_cause.at[idx, "matched"].append(event)
fig = px.bar(
df_flood_cause,
x="event",
y="count",
color="count",
text="count",
)
fig.update_layout(title=dict(text="Main Cause of Flood Events", x=0.5))
fig.show(renderer="notebook")
fig = px.pie(
df_flood_data["flooded"].value_counts().to_frame().reset_index(),
values="count",
names=["Non-water", "Flooded"],
)
fig.update_layout(
title=dict(text="Ratio of Flooded Area Against Non-Water Area in Dataset", x=0.5),
margin_b=20,
)
fig.show(renderer="notebook")
Global flood occurrence (only primary influenced country)
df_flood_by_country = pd.DataFrame(
df_flood_props_cleaned["primary_country"].value_counts().reset_index(name="counts")
)
scattergeo_trace = go.Scattergeo(
lat=df_flood_props_cleaned["center_lat"],
lon=df_flood_props_cleaned["center_lon"],
mode="markers",
marker=dict(
size=12,
opacity=0.8,
autocolorscale=False,
symbol="triangle-down",
colorscale="Reds",
cmin=0,
color=df_flood_props_cleaned["severity"],
cmax=df_flood_props_cleaned["severity"].max(),
colorbar_title="Severity",
colorbar_x=1.2,
),
hoverinfo="skip",
)
choropleth_trace = go.Choropleth(
locations=df_flood_by_country["primary_country"],
locationmode="country names",
z=df_flood_by_country["counts"],
colorscale="amp",
colorbar_title="Occurence",
hoverlabel_namelength=0,
)
fig = go.Figure(data=[scattergeo_trace, choropleth_trace])
fig.data[0].showlegend = False
fig.update_layout(
title_text="Flood Severity and Occurrence",
title_x=0.5,
geo=dict(
showland=True,
landcolor="rgb(95, 138, 92)",
showcountries=True,
countrycolor="rgb(255, 255, 255)",
showocean=True,
oceancolor="rgb(158,202,225)",
projection_type="orthographic",
),
margin={"b": 15, "l": 10, "r": 0, "t": 70},
shapes=list(
[
dict(
fillcolor="rgb(95, 138, 92)",
layer="below",
line={"dash": "dash"},
name="Country not primarily <br>affected by flood",
showlegend=True,
type="rect",
xref="paper",
x0=0.001,
x1=0.002,
yref="paper",
y0=0.001,
y1=0.002,
)
]
),
legend=dict(
yanchor="top",
y=1,
xanchor="left",
x=0,
bgcolor="LightSteelBlue",
bordercolor="Black",
borderwidth=1,
),
updatemenus=[
{
"buttons": [
{
"args": [
{"geo.projection.type": "orthographic"},
],
"label": "Orthographic",
"method": "relayout",
},
{
"args": [
{"geo.projection.type": "equirectangular"},
],
"label": "Equirectangular",
"method": "relayout",
},
],
"direction": "left",
"showactive": True,
"type": "buttons",
"x": 0,
"xanchor": "left",
"y": 1.15,
"yanchor": "top",
}
],
)
fig.show(renderer="notebook")
fig = px.scatter(
df_flood_props_cleaned,
x="displaced",
y="dead",
size="area",
color="primary_country",
hover_name="id",
)
fig.update_layout(
title=dict(text="Estimated Displaced Against Fatalities Due to Flood Event", x=0.5),
height=750,
margin_t=100,
)
fig.show(renderer="notebook")
Select the image of 2010 Pakistan floods for visualizing
target_flood = "DFO_2507_From_20040620_to_20041007"
max_pixels = 10000
scale = 25000
target_image = (
images.filter(ee.Filter.eq("system:index", target_flood)).first().unmask()
)
# Extract the image as features (x, y, z, flooded) to investigate the relationship between elevation/slope and flood
target_image_features = ee.FeatureCollection(
target_image.reduceResolution(reducer=ee.Reducer.mean(), maxPixels=max_pixels)
.addBands(ee.Image.pixelCoordinates(projection=target_image.projection()))
.select(["x", "y", "z", "flooded"])
.reduceRegion(
ee.Reducer.toCollection(["x", "y", "z", "flooded"]),
scale=scale,
geometry=target_image.geometry().bounds(),
crs=target_image.projection(),
)
.get("features")
)
display(target_image)
print(f"Number of points: {target_image_features.size().getInfo()}")
- type:Image
- id:GLOBAL_FLOOD_DB/MODIS_EVENTS/V1/DFO_2507_From_20040620_to_20041007
- version:1685079779514552
- id:duration
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:int
- 0:11955
- 1:7630
- id:clear_perc
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- precision:float
- 0:11955
- 1:7630
- id:flooded
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:255
- min:0
- precision:int
- 0:11955
- 1:7630
- id:SR_B1
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B2
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B3
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B4
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B5
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:double
- id:SR_B7
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:65535
- min:0
- precision:double
- id:z
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:32767
- min:-32768
- precision:int
- 0:160302
- 1:53434
- 0:-112827
- 1:-12115
- id:slope
- crs:EPSG:4326
- 0:0.002245788210298804
- 1:0
- 2:73.38337555972372
- 3:0
- 4:-0.002245788210298804
- 5:32.792999446783135
- type:PixelType
- max:90
- min:0
- precision:float
- id:2507
- cc:CHN, IND, MMR, NPL, BGD, BTN
- composite_type:3Day
- countries:China, India, Myanmar (Burma), Nepal, Bangladesh, Bhutan
- dfo_centroid_x:85.891802
- dfo_centroid_y:25.498908
- dfo_country:India
- dfo_dead:3000
- dfo_displaced:40000000
- dfo_main_cause:Monsoonal rain
- dfo_other_country:Bangladesh
- dfo_severity:2
- dfo_validation_type:News
- gfd_country_code:['TH', 'BM', 'BG', 'IN', 'CH', 'BT', 'UU', 'NP']
- gfd_country_name:['THAILAND', 'BURMA', 'BANGLADESH', 'INDIA', 'CHINA', 'BHUTAN', 'In dispute BHUTAN/CHINA', 'In dispute CHINA/INDIA', 'NEPAL', 'In dispute INDIA/NEPAL']
- glide_index:NA
- slope_threshold:5
- system:asset_size:58741567
- type:LinearRing
- 0:91.00228587710755
- 1:32.79412626840574
- 0:87.6465168385273
- 1:32.7941262869687
- 0:83.45180557365863
- 1:32.794126257013616
- 0:80.09603652705451
- 1:32.794126296226565
- 0:75.90132526971159
- 1:32.79412626260718
- 0:73.38182225783144
- 1:32.79412176803111
- 0:73.38216204890047
- 1:15.656504148386379
- 0:75.48185411728672
- 1:15.650523501363097
- 0:79.25709428358672
- 1:15.646932740263926
- 0:83.0323343845583
- 1:15.65052350640998
- 0:85.54916121803853
- 1:15.65531155198055
- 0:88.0659879609319
- 1:15.650523486946916
- 0:91.8412281293297
- 1:15.646932731827567
- 0:95.6164683333735
- 1:15.650523541996755
- 0:98.13329506611277
- 1:15.655311500554676
- 0:100.2329870589596
- 1:15.653303161209505
- 0:100.23332694632873
- 1:32.79412181427747
- 0:93.93858380512515
- 1:32.79412626662085
- 0:91.00228587710755
- 1:32.79412626840574
- system:index:DFO_2507_From_20040620_to_20041007
- system:time_end:1097107200000
- system:time_start:1087689600000
- threshold_b1b2:0.7
- threshold_b7:675
- threshold_type:standard
Number of points: 9120
# geemap.ee_to_csv(
# target_image_features, filename=f"../data/asg3/{target_flood}_data.csv"
# )
Plot 3D DEM of target image with flood area colored
target_image_data = pd.read_csv(f"../data/asg3/{target_flood}_data.csv").to_numpy()
flooded = target_image_data[:, 0]
x = target_image_data[:, 1]
y = target_image_data[:, 2]
z = target_image_data[:, -1]
xmin, xmax = np.min(x), np.max(x)
ymin, ymax = np.min(y), np.max(y)
xstep, ystep = (
np.round((xmax - xmin) / np.unique(x).shape).item(),
np.round((ymax - ymin) / np.unique(y).shape).item(),
)
u = np.arange(start=xmin, stop=xmax, step=xstep)
v = np.arange(start=ymin, stop=ymax, step=ystep)
X, Y = np.meshgrid(u, v)
Z = scipy.interpolate.griddata(
(x, y), z, (X, Y), method="nearest", fill_value=0, rescale=True
)
F = scipy.interpolate.griddata(
(x, y), flooded, (X, Y), method="nearest", fill_value=0, rescale=True
)
Fmin, Fmax = np.min(F), np.max(F)
fig = go.Figure(
data=[
go.Surface(
z=Z, colorscale="Plotly3", showscale=True, colorbar_title="Elevation"
),
go.Surface(
z=Z + 150,
surfacecolor=F,
cmin=Fmin,
cmax=Fmax,
colorscale=[
[0, "rgba(7, 148, 242, 0.0)"],
[0.3, "rgba(7, 148, 242, 1.0)"],
[1, "rgba(7, 148, 242, 1.0)"],
],
colorbar=dict(title="Flood mean", x=-0.1),
),
]
)
fig.update_traces(
contours_z=dict(
show=True,
usecolormap=True,
project_z=True,
color="rgba(255, 0, 0, 0.5)",
)
)
fig.update_layout(
scene_camera_eye=dict(x=1.5, y=1.5, z=1.25),
margin=dict(l=0, r=0, t=0, b=30),
title=dict(text=f"3D DEM of {target_flood}", x=0.5, y=0.95),
updatemenus=[
dict(
type="buttons",
buttons=[
dict(
label="Toggle Elevation Surface",
method="update",
args=[
{
"visible": [True, True],
"contours.z.usecolormap": [True, True],
},
],
args2=[
{
"visible": [False, True],
"contours.z.usecolormap": [False, False],
},
],
),
],
direction="left",
showactive=True,
x=-0.1,
xanchor="left",
y=1.1,
yanchor="top",
)
],
)
fig.show(renderer="notebook")
Visualize dataset and target flood event with satellite view
vis_params = {"min": 0, "max": 10, "palette": ["c3effe", "1341e8", "051cb0", "001133"]}
Map.addLayer(
flood_collection.select("jrc_perm_water").sum().gte(1).selfMask(),
{"min": 0, "max": 1, "palette": "c3effe"},
"JRC Permanent Water",
)
Map.addLayer(
flood_collection.select("flooded").sum().selfMask(),
vis_params,
"GFD Satellite Observed Flood Plain",
)
Map.add_colorbar_branca(
colors=vis_params["palette"],
vmin=vis_params["min"],
vmax=vis_params["max"],
layer_name="",
caption="Flood occurrence",
discrete=True,
)
Map.addLayer(
images.select("flooded").sum().selfMask(),
vis_params,
"Masked GFD Satellite Observed Flood Plain",
)
Map.addLayer(
target_image.select("flooded").selfMask(),
vis_params,
"Selected flood event",
)
Map.addLayer(
ee.FeatureCollection(ee.Feature(target_image.geometry().bounds())).style(
color="red", fillColor="00000000"
),
{},
"Selected flood event boundary",
)
Map
Map(center=[0, 0], controls=(WidgetControl(options=['position', 'transparent_bg'], widget=SearchDataGUI(childr…
Feature Engineering¶
Initial selected features
features = {
"SR_B1": "Blue",
"SR_B2": "Green",
"SR_B3": "Red",
"SR_B4": "NIR",
"SR_B5": "SWIR_1",
"SR_B7": "SWIR_2",
"slope": "Slope",
"z": "Elevation",
}
x = df_flood_data[features.keys()].rename(columns=features)
y = df_flood_data["flooded"]
Feature Creation¶
Transformed Index
- NDWI - Normalized Difference Water Index
- MBI - Modified Bare Soil Index
- NDVI - Normalized Difference Vegetation Index
- WRI - Water Ratio Index
- AWEI - Automated Water Extraction Index
- SI - Shadow Index
- NWI - New Water Index
x["NDWI"] = (x["Green"] - x["NIR"]) / (x["Green"] + x["NIR"])
x["MBI"] = (
(x["SWIR_1"] - x["SWIR_2"] - x["NIR"]) / (x["SWIR_1"] + x["SWIR_2"] + x["NIR"])
) + 0.5
x["NDVI"] = (x["NIR"] - x["Red"]) / (x["NIR"] + x["Red"])
x["WRI"] = (x["Green"] + x["Red"]) / (x["NIR"] + x["SWIR_2"])
x["AWEI"] = (
x["Blue"] + 2.5 * x["Green"] - 1.5 * (x["NIR"] + x["SWIR_1"]) - 0.25 * x["SWIR_2"]
)
x["SI"] = (1 - x["Red"]) * (1 - x["Blue"]) * (1 - x["Green"])
x["NWI"] = (x["Blue"] - (x["NIR"] + x["SWIR_1"] + x["SWIR_2"])) / (
x["Blue"] + (x["NIR"] + x["SWIR_1"] + x["SWIR_2"])
)
Correlation heatmap of initial selected features
corr = x.corr().round(4)
fig = px.imshow(corr, text_auto=True, aspect="auto")
fig.update_layout(title=dict(text="Correlation of Features", x=0.5))
fig.show(renderer="notebook")
Use LightGBM as base model for SHAP
forest = lgb.LGBMClassifier(
objective="binary",
n_estimators=1000,
learning_rate=0.01,
deterministic=True,
device_type="cpu",
verbose=0,
)
forest
LGBMClassifier(deterministic=True, device_type='cpu', learning_rate=0.01,
n_estimators=1000, objective='binary', verbose=0)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LGBMClassifier(deterministic=True, device_type='cpu', learning_rate=0.01,
n_estimators=1000, objective='binary', verbose=0)X_train, X_test, y_train, y_test = cuml_train_test_split(
x, y, test_size=0.3, shuffle=True, stratify=y
)
forest.fit(X_train.to_numpy(), y_train.to_numpy())
full_clustering_path = Path("../data/asg3/f_clustering.npy")
shap_values_path = Path("../data/asg3/shap_values.npy")
if full_clustering_path.exists():
clustering = np.load(full_clustering_path)
else:
clustering = shap.utils.hclust(X_train.to_numpy(), y_train.to_numpy())
np.save(full_clustering_path, clustering)
if shap_values_path.exists():
shap_values = np.load(shap_values_path)
else:
masker = shap.maskers.Partition(X_train.to_numpy(), clustering=clustering)
explainer = PermutationExplainer(
forest.predict_proba, masker, data=X_train, masker_type="partition"
)
shap_values = explainer.shap_values(X_test.to_numpy())
np.save(shap_values_path, shap_values)
All Features
expected_value = np.mean(forest.predict_proba(X_test.to_numpy()), axis=0)
explanation = shap.Explanation(
shap_values,
data=X_test.to_numpy(),
base_values=np.full(shape=np.sum(shap_values, axis=1).shape, fill_value=expected_value),
feature_names=list(x.columns.values),
)
fig, axes = plt.subplots(1,2)
axes[0].set_title("Feature Importance and Clustering \nBased on SHAP Values")
shap.plots.bar(explanation[...,1], clustering=clustering, max_display=len(x.columns), ax=axes[0], show=False)
shap.decision_plot(expected_value[1], shap_values[...,1][:10], feature_names=list(x.columns.values), show=False)
axes[1] = plt.gca()
axes[1].set_title("Complexity of Model for Making Prediction")
fig.set_dpi(1000)
fig.suptitle("Analysis for All Features", fontsize=16)
fig.tight_layout()
fig.set_size_inches((10, 8))
Selected Features:
- AWEI
- NIR
- "Slope
- "Elevation
final_features = ["AWEI", "NIR", "Slope", "Elevation"]
X_train, X_temp, y_train, y_temp = train_test_split(
x[final_features], y, test_size=0.3, stratify=y
)
X_val, X_test, y_val, y_test = train_test_split(
X_temp, y_temp, test_size=0.5, stratify=y_temp
)
np.savez(
"../data/asg3/complete_data1.npz",
X_train=X_train,
y_train=y_train,
X_test=X_test,
y_test=y_test,
X_val=X_val,
y_val=y_val,
)
data = np.load("../data/asg3/complete_data1.npz")
(
X_train,
X_test,
y_train,
y_test,
X_val,
y_val,
) = (
data["X_train"],
data["X_test"],
data["y_train"],
data["y_test"],
data["X_val"],
data["y_val"],
)
train_data = lgb.Dataset(X_train, y_train, params={"feature_pre_filter": False})
forest.fit(X_train, y_train)
explainer = shap.GPUTreeExplainer(forest)
clustering = shap.utils.hclust(X_train, y_train)
explanation = explainer(X_test)
shap_values = explainer.shap_values(X_test)
5it [00:16, 5.63s/it]
fig, axes = plt.subplots(1, 2, figsize=(10, 8))
axes[0].set_title("Feature Importance and Clustering \nBased on SHAP Values")
shap.plots.bar(
explanation,
clustering=clustering,
max_display=len(x.columns),
ax=axes[0],
show=False,
)
shap.decision_plot(
explainer.expected_value,
shap_values[:10],
feature_names=final_features,
link="logit",
show=False,
auto_size_plot=False,
)
axes[1] = plt.gca()
axes[1].set_title("Complexity of Model for Making Prediction")
fig.set_dpi(1000)
fig.suptitle("Analysis for Selected Features", fontsize=16)
fig.tight_layout()
fig.set_size_inches((10, 8))
Model Development & Hyperparameter Tuning¶
def evaluate_model(y_pred, y_test):
precision, recall, f1, support = precision_recall_fscore_support(
y_test, y_pred, average="binary"
)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred)
return [accuracy, precision, recall, f1, auc]
Logistic Regression model as linear model
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
lr_y_pred = logistic_model.predict(X_test)
lr_results = evaluate_model(lr_y_pred, y_test)
lr_results
LightGBM Model as non-linear model
lgb_model = lgb.cv(
{
"objective": "binary",
"metric": "auc",
"device_type": "cpu",
"verbose": -1,
"boosting_type": "gbdt",
"deterministic": True,
},
train_data,
nfold=5,
stratified=True,
shuffle=True,
metrics="auc",
return_cvbooster=True,
)
# Predict and evaluate
lgb_y_pred = (lgb_model["cvbooster"].predict(X_test)[0] > 0.5).astype("int")
lgb_results = evaluate_model(lgb_y_pred, y_test)
lgb_results
TabNet as deep learning model
device = "cuda" if torch.cuda.is_available() else "cpu"
tabnet = TabNetClassifier(device_name="cuda", verbose=0)
tabnet.fit(X_train, y_train, eval_set=[((X_val,y_val))])
tabnet_y_pred = tabnet.predict(X_test)
tabnet_results = evaluate_model(tabnet_y_pred, y_test)
Initial result comparison for model selection
metrics_df = pd.DataFrame(
{
"Metric": ["Accuracy", "Precision", "Recall", "F1 Score", "AUC"],
"Logistic Regression": lr_results,
"LightGBM": lgb_results,
"TabNet": tabnet_results,
}
)
metrics_df.to_csv("../data/asg3/initial_results.csv", index=False)
metrics_df.style.background_gradient(axis=1)
| Metric | Logistic Regression | LightGBM | TabNet | |
|---|---|---|---|---|
| 0 | Accuracy | 0.787208 | 0.854405 | 0.848183 |
| 1 | Precision | 0.792210 | 0.874955 | 0.865416 |
| 2 | Recall | 0.764326 | 0.818568 | 0.815678 |
| 3 | F1 Score | 0.778018 | 0.845823 | 0.839811 |
| 4 | AUC | 0.786666 | 0.853558 | 0.847414 |
Since LightGBM is the best model in term of selected metrics and computation time, it is used as final model for hypermeter tuning and classification
cv_study = optuna.create_study(
study_name="LightGBM_tuner_cv",
storage="sqlite:///../data/asg3/lightgbm_tuner_cv.db",
direction="maximize",
load_if_exists=True,
)
tuner_pickle_path = Path("../data/asg3/lightgbm_tuner_cv.pkl")
if tuner_pickle_path.exists():
# Load the tuner from the pickle file
with open(tuner_pickle_path, "rb") as f:
tuner = pickle.load(f)
else:
tuner = LightGBMTunerCV(
{
"objective": "binary",
"metric": "auc",
"device_type": "cpu",
"verbose": -1,
"boosting_type": "gbdt",
"deterministic": True,
},
train_data,
study=cv_study,
nfold=5,
stratified=True,
shuffle=True,
verbosity=-1,
show_progress_bar=True,
return_cvbooster=True,
model_dir="../data/asg3/models",
)
tuner.run()
with open(tuner_pickle_path, "wb") as f:
pickle.dump(tuner, f)
tuned_lgb = tuner.get_best_booster()
tuned_lgb_y_pred = (tuned_lgb.predict(X_test)[0] > 0.5).astype("int")
tuned_lgb_results = evaluate_model(tuned_lgb_y_pred, y_test)
0%| | 0/7 [00:00<?, ?it/s] 0%| | 0/20 [00:00<?, ?it/s] 0%| | 0/10 [00:00<?, ?it/s] 0%| | 0/6 [00:00<?, ?it/s] 0%| | 0/20 [00:00<?, ?it/s] 0%| | 0/5 [00:00<?, ?it/s]
Final result comparison
final_results = pd.read_csv("../data/asg3/initial_results.csv")
final_results["Tuned_LightGBM"] = tuned_lgb_results
final_results.to_csv("../data/asg3/final_results.csv")
final_results.style.background_gradient(axis=1)
| Metric | Logistic Regression | LightGBM | TabNet | Tuned_LightGBM | |
|---|---|---|---|---|---|
| 0 | Accuracy | 0.787208 | 0.854405 | 0.848183 | 0.864858 |
| 1 | Precision | 0.792210 | 0.874955 | 0.865416 | 0.882374 |
| 2 | Recall | 0.764326 | 0.818568 | 0.815678 | 0.834212 |
| 3 | F1 Score | 0.778018 | 0.845823 | 0.839811 | 0.857617 |
| 4 | AUC | 0.786666 | 0.853558 | 0.847414 | 0.864133 |
Model Evaluation¶
Draw a shape on map (preferable rectangle) or use existing Area of Interest (ROI)
Map
Map(bottom=1996.0, center=[29.53522956294847, 80.77148437500001], controls=(WidgetControl(options=['position',…
This cell will draw back the last ROI geometry and reuse the data if exists
bands = ["SR_B1", "SR_B2", "SR_B4", "SR_B5", "SR_B7", "z", "slope", "flooded"]
roi_geo_path = Path("../data/asg3/roi_geo.npy")
if Map.draw_last_feature:
geometry = Map.draw_last_feature.geometry()
bounds = geemap.ee_to_bbox(geometry)
np.save(roi_geo_path, bounds)
reduced_target_image = target_image.reduceResolution(
reducer=ee.Reducer.mean(),
)
np.save(
"../data/asg3/roi",
geemap.ee_to_numpy(
reduced_target_image,
region=geometry,
scale=250,
bands=bands,
),
)
Map.remove_drawn_features()
else:
if roi_geo_path.exists():
bounds = np.load(roi_geo_path).tolist()
roi_geo = ee.Geometry.Rectangle(coords=bounds)
Map.centerObject(roi_geo, zoom=10)
roi_geo = ee.FeatureCollection(roi_geo).style(
fillColor="#3181cc33", color="red", width=1
)
Map.addLayer(roi_geo, {}, "ROI Geometry")
Transform pixel values within ROI into dataset and perform classification with confidence level of 0.95
data: np.ndarray = np.load("../data/asg3/roi.npy")
rows, cols, bands_len = data.shape
array_2d = data.reshape(-1, bands_len)
coordinates = [(row, col) for row in range(rows) for col in range(cols)]
data_with_coords = [
{"x": x, "y": y, **{bands[i]: array_2d[idx, i] for i in range(bands_len)}}
for idx, (x, y) in enumerate(coordinates)
]
features = {
"SR_B1": "Blue",
"SR_B2": "Green",
"SR_B4": "NIR",
"SR_B5": "SWIR_1",
"SR_B7": "SWIR_2",
"slope": "Slope",
"z": "Elevation",
}
df_target_image = pd.DataFrame(data_with_coords).rename(columns=features)
x = df_target_image[features.values()]
y = df_target_image["flooded"]
x["AWEI"] = (
x["Blue"] + 2.5 * x["Green"] - 1.5 * (x["NIR"] + x["SWIR_1"]) - 0.25 * x["SWIR_2"]
)
confidence = 0.95
y_pred: np.ndarray = (tuned_lgb.predict(x[final_features])[0] > confidence).astype("int")
df_target_image["pred_flooded"] = y_pred
Plot the results as side by side image
def get_grid(df, label):
grid_x = df["x"].max() + 1
grid_y = df["y"].max() + 1
# Initialize an empty grid
grid = np.zeros((grid_x, grid_y, 3), dtype=np.float32)
# Populate the grid with colors
for _, row in df.iterrows():
x, y, color = int(row["x"]), int(row["y"]), row[label]
grid[x, y] = color
return grid
grid_ori = get_grid(df_target_image, "flooded")
grid_pred = get_grid(df_target_image, "pred_flooded")
fig = px.imshow(
np.array((grid_ori, grid_pred)),
facet_col=0,
aspect="auto",
height=600,
)
item_map = {
f"{i}": key for i, key in enumerate(["Original", "Prediction (Confidence=0.95)"])
}
fig.for_each_annotation(lambda a: a.update(text=item_map[a.text.split("=")[1]]))
fig.update_layout(
yaxis=dict(scaleanchor="x", scaleratio=1.5),
title="Comparison of Original and Predicted Flooded Areas",
title_x=0.5,
)
fig.show(renderer="notebook")